# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set style for better-looking plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load and explore the dataset
df = pd.read_csv('UFO_Sightings_df.csv')  # Replace with your actual file path

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())
print("\nData Types:")
print(df.dtypes)
print("\nFirst 5 rows:")
df.head()

Dataset Shape: (88875, 12)

Column Names:
['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)', 'duration (hours/min)', 'comments', 'date posted', 'latitude', 'longitude', 'Unnamed: 11']

Data Types:
datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
Unnamed: 11             float64
dtype: object

First 5 rows:

# Data Cleaning and Preparation
# Convert datetime columns
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
df['date posted'] = pd.to_datetime(df['date posted'], errors='coerce')

# Extract time features for analysis
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['dayofweek'] = df['datetime'].dt.day_name()
df['dayofweek_num'] = df['datetime'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin(['Friday', 'Saturday', 'Sunday'])

# Clean duration column - convert to numeric
df['duration_seconds'] = pd.to_numeric(df['duration (seconds)'], errors='coerce')

# Remove rows with missing critical data
df_clean = df.dropna(subset=['datetime', 'city', 'state'])

# Filter for reasonable years (1950-2023)
df_clean = df_clean[(df_clean['year'] >= 1950) & (df_clean['year'] <= 2023)]

print(f"Original dataset: {len(df)} rows")
print(f"Cleaned dataset: {len(df_clean)} rows")
print(f"Removed: {len(df) - len(df_clean)} rows ({((len(df) - len(df_clean))/len(df)*100):.2f}%)")

Original dataset: 88875 rows
Cleaned dataset: 80152 rows
Removed: 8723 rows (9.81%)

# Plot 1 - The Weekend Effect (Bar Plot)
# This plot will highlight the weekend in orange, the weekdays in blue. Then we will be able to see if more sightings occur in orange regions.
plt.figure(figsize=(12, 6))

# Order days properly
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = df_clean['dayofweek'].value_counts().reindex(day_order)

# Create bar plot with different colors for weekends
colors = ['skyblue', 'skyblue', 'skyblue', 'skyblue', 'lightcoral', 'lightcoral', 'lightcoral']
bars = plt.bar(day_order, day_counts.values, color=colors)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height):,}',
             ha='center', va='bottom')

plt.title('UFO Sightings by Day of Week: The Weekend Effect', fontsize=16, fontweight='bold')
plt.xlabel('Day of Week', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.xticks(rotation=45)

# Add legend
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor='skyblue', label='Weekday'),
                  Patch(facecolor='lightcoral', label='Weekend')]
plt.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

# Calculate weekend vs weekday statistics
weekend_sightings = df_clean[df_clean['is_weekend']].shape[0]
weekday_sightings = df_clean[~df_clean['is_weekend']].shape[0]
weekend_avg = weekend_sightings / 3  # 3 weekend days
weekday_avg = weekday_sightings / 4  # 4 weekdays

print(f"\nWeekend Analysis:")
print(f"Total weekend sightings: {weekend_sightings:,}")
print(f"Total weekday sightings: {weekday_sightings:,}")
print(f"Average per weekend day: {weekend_avg:,.0f}")
print(f"Average per weekday: {weekday_avg:,.0f}")
print(f"Weekend boost: {((weekend_avg - weekday_avg) / weekday_avg * 100):.1f}%")

Weekend Analysis:
Total weekend sightings: 37,284
Total weekday sightings: 42,868
Average per weekend day: 12,428
Average per weekday: 10,717
Weekend boost: 16.0%

# Plot 2 - Geographic Distribution (Horizontal Bar Plot)
# This plot will show the top US states in which UFO sightings where reported

# Filter for US sightings and get top 20 states
us_sightings = df_clean[df_clean['country'] == 'us'].copy()
top_states = us_sightings['state'].value_counts().head(20)

plt.figure(figsize=(10, 12))
plt.barh(top_states.index, top_states.values, color='darkgreen')

# Add value labels
for i, v in enumerate(top_states.values):
    plt.text(v + 50, i, f'{v:,}', va='center')

plt.title('Top 20 US States by UFO Sightings', fontsize=16, fontweight='bold')
plt.xlabel('Number of Sightings', fontsize=12)
plt.ylabel('State', fontsize=12)
plt.tight_layout()
plt.show()

print(f"Top 5 UFO Hotspots:")
for i, (state, count) in enumerate(top_states.head().items(), 1):
    print(f"{i}. {state.upper()}: {count:,} sightings")

Top 5 UFO Hotspots:
1. CA: 9,461 sightings
2. WA: 4,238 sightings
3. FL: 4,116 sightings
4. TX: 3,682 sightings
5. NY: 3,191 sightings

# Plot 3 - Sightings Over Time (Line Plot with Date Splitting)
# This plot will show the recorded UFO sightings over time from 1950-2023

# Group by year and count sightings
yearly_sightings = df_clean.groupby('year').size()

plt.figure(figsize=(14, 7))
plt.plot(yearly_sightings.index, yearly_sightings.values, linewidth=2, color='purple')
plt.fill_between(yearly_sightings.index, yearly_sightings.values, alpha=0.3, color='purple')

# Mark significant years
plt.axvline(x=1969, color='red', linestyle='--', alpha=0.5, label='Moon Landing')
plt.axvline(x=1989, color='orange', linestyle='--', alpha=0.5, label='Digital Camera Commercial Release')
plt.axvline(x=2000, color='green', linestyle='--', alpha=0.5, label='The First Mass-Market Camera Phone')

plt.title('UFO Sightings Over Time (1950-2023)', fontsize=16, fontweight='bold')
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Find peak years
peak_years = yearly_sightings.nlargest(5)
print("Peak UFO Sighting Years:")
for year, count in peak_years.items():
    print(f"{year}: {count:,} sightings")

Peak UFO Sighting Years:
2012.0: 7,470 sightings
2013.0: 7,228 sightings
2011.0: 5,199 sightings
2008.0: 4,735 sightings
2009.0: 4,378 sightings

# Plot 4 - Time of Day Analysis (Histogram with Weekend Filter)
# This plot will show the time of day in which reports were given and then compare weekday vs weekend reports for time.

# Filter for weekend vs weekday
weekend_hours = df_clean[df_clean['is_weekend']]['hour']
weekday_hours = df_clean[~df_clean['is_weekend']]['hour']

plt.figure(figsize=(12, 6))
bins = range(0, 25)

plt.hist([weekday_hours, weekend_hours], bins=bins, label=['Weekday', 'Weekend'], 
         alpha=0.7, color=['blue', 'red'], edgecolor='black')

plt.title('UFO Sightings by Hour of Day: Weekend vs Weekday', fontsize=16, fontweight='bold')
plt.xlabel('Hour of Day (24-hour format)', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.legend()
plt.xticks(range(0, 24))
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Calculate late night sightings (10 PM - 4 AM)
late_night = df_clean[(df_clean['hour'] >= 22) | (df_clean['hour'] <= 4)]
late_night_weekend = late_night[late_night['is_weekend']].shape[0]
late_night_weekday = late_night[~late_night['is_weekend']].shape[0]

print(f"\nLate Night Analysis (10 PM - 4 AM):")
print(f"Weekend late night sightings: {late_night_weekend:,}")
print(f"Weekday late night sightings: {late_night_weekday:,}")
print(f"Ratio: {late_night_weekend/late_night_weekday:.2f}x more on weekends")

Late Night Analysis (10 PM - 4 AM):
Weekend late night sightings: 15,498
Weekday late night sightings: 16,579
Ratio: 0.93x more on weekends

# Plot 5 - UFO Shapes (Pie Chart)
# This plot will show the reported UFO shapes

# Get top 10 shapes
top_shapes = df_clean['shape'].value_counts().head(10)

plt.figure(figsize=(10, 8))
colors = plt.cm.Set3(range(len(top_shapes)))
plt.pie(top_shapes.values, labels=top_shapes.index, autopct='%1.1f%%', 
        colors=colors, startangle=90)
plt.title('Top 10 Most Reported UFO Shapes', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("Top 5 UFO Shapes:")
for shape, count in top_shapes.head().items():
    print(f"{shape.capitalize()}: {count:,} sightings ({count/len(df_clean)*100:.1f}%)")

Top 5 UFO Shapes:
Light: 16,385 sightings (20.4%)
Triangle: 7,853 sightings (9.8%)
Circle: 7,512 sightings (9.4%)
Fireball: 6,057 sightings (7.6%)
Unknown: 5,758 sightings (7.2%)

# Plot 6 - Seasonal Pattern (Bar Plot with Month Splitting)
# This plot will show UFO reports by seasons in the USA

monthly_sightings = df_clean.groupby('month').size()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

plt.figure(figsize=(12, 6))
# Color by season
colors = ['lightblue' if m in [12, 1, 2] else 'lightgreen' if m in [3, 4, 5] 
          else 'yellow' if m in [6, 7, 8] else 'orange' 
          for m in range(1, 13)]

bars = plt.bar(month_names, monthly_sightings.values, color=colors, edgecolor='black')

# Add value labels
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height):,}',
             ha='center', va='bottom')

plt.title('UFO Sightings by Month: Seasonal Patterns', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)

# Add season legend
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor='lightblue', label='Winter'),
                  Patch(facecolor='lightgreen', label='Spring'),
                  Patch(facecolor='yellow', label='Summer'),
                  Patch(facecolor='orange', label='Autumn')]
plt.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

# Calculate seasonal statistics
winter = monthly_sightings[[12, 1, 2]].sum()
spring = monthly_sightings[[3, 4, 5]].sum()
summer = monthly_sightings[[6, 7, 8]].sum()
autumn = monthly_sightings[[9, 10, 11]].sum()

print("Seasonal Analysis:")
print(f"Winter: {winter:,} sightings")
print(f"Spring: {spring:,} sightings")
print(f"Summer: {summer:,} sightings")
print(f"Autumn: {autumn:,} sightings")
print(f"\nPeak season: Summer with {summer/len(df_clean)*100:.1f}% of all sightings")

Seasonal Analysis:
Winter: 15,879 sightings
Spring: 16,119 sightings
Summer: 26,123 sightings
Autumn: 22,031 sightings

Peak season: Summer with 32.6% of all sightings

# Plot 7 - Duration Analysis (Box Plot with Weekend Comparison)
# This plot will show how long on average the reporters UFO event lasted

# Filter for reasonable durations (less than 30 minutes = 1800 seconds for better visualization)
duration_filtered = df_clean[(df_clean['duration_seconds'] > 0) & 
                            (df_clean['duration_seconds'] < 1800)].copy()

# Create figure
plt.figure(figsize=(10, 6))

# Box plot comparing weekend vs weekday durations
plt.boxplot([duration_filtered[~duration_filtered['is_weekend']]['duration_seconds'],
             duration_filtered[duration_filtered['is_weekend']]['duration_seconds']],
            labels=['Weekday', 'Weekend'],
            patch_artist=True,
            boxprops=dict(facecolor='lightblue', color='darkblue'),
            medianprops=dict(color='red', linewidth=2),
            whiskerprops=dict(color='darkblue'),
            capprops=dict(color='darkblue'))

plt.title('UFO Sighting Duration: Weekend vs Weekday', fontsize=14, fontweight='bold')
plt.ylabel('Duration (seconds)', fontsize=12)
plt.grid(axis='y', alpha=0.3)

# Group into time periods for clearer visualization
duration_filtered['time_period'] = pd.cut(duration_filtered['hour'], 
                                         bins=[-1, 6, 12, 18, 24],
                                         labels=['Night (12AM-6AM)', 'Morning (6AM-12PM)', 
                                                'Afternoon (12PM-6PM)', 'Evening (6PM-12AM)'])

# Calculate median duration by time period and weekend status
time_period_data = duration_filtered.groupby(['time_period', 'is_weekend'])['duration_seconds'].agg(['median', 'count']).reset_index()

# Show the plot
plt.show()

# Statistical summary
print("Duration Analysis Summary:")
print("=" * 50)

# Overall statistics
print(f"\nOverall Statistics (durations < 30 minutes):")
print(f"Average duration: {duration_filtered['duration_seconds'].mean():.0f} seconds")
print(f"Median duration: {duration_filtered['duration_seconds'].median():.0f} seconds")

# Weekend vs Weekday comparison
weekend_mean = duration_filtered[duration_filtered['is_weekend']]['duration_seconds'].mean()
weekday_mean = duration_filtered[~duration_filtered['is_weekend']]['duration_seconds'].mean()
weekend_median = duration_filtered[duration_filtered['is_weekend']]['duration_seconds'].median()
weekday_median = duration_filtered[~duration_filtered['is_weekend']]['duration_seconds'].median()

print(f"\nWeekend vs Weekday:")
print(f"Weekend - Mean: {weekend_mean:.0f}s, Median: {weekend_median:.0f}s")
print(f"Weekday - Mean: {weekday_mean:.0f}s, Median: {weekday_median:.0f}s")
print(f"Difference: {abs(weekend_mean - weekday_mean):.0f}s (mean), {abs(weekend_median - weekday_median):.0f}s (median)")

# Time of day analysis
print(f"\nSightings by Time Period:")
time_counts = duration_filtered['time_period'].value_counts()
for period, count in time_counts.items():
    print(f"{period}: {count:,} sightings ({count/len(duration_filtered)*100:.1f}%)")

Duration Analysis Summary:
==================================================

Overall Statistics (durations < 30 minutes):
Average duration: 273 seconds
Median duration: 120 seconds

Weekend vs Weekday:
Weekend - Mean: 276s, Median: 120s
Weekday - Mean: 271s, Median: 120s
Difference: 6s (mean), 0s (median)

Sightings by Time Period:
Evening (6PM-12AM): 37,171 sightings (57.1%)
Night (12AM-6AM): 12,312 sightings (18.9%)
Afternoon (12PM-6PM): 10,261 sightings (15.8%)
Morning (6AM-12PM): 5,298 sightings (8.1%)

# Plot 8 - Sightings by Hour and Day of Week (Heatmap)
# This plot will show the heatmap of what day and when during it reports came in

hourly_dow = df_clean.groupby(['dayofweek_num', 'hour']).size().unstack(fill_value=0)

plt.figure(figsize=(12, 8))
sns.heatmap(hourly_dow, cmap='YlOrRd', annot=True, fmt='d', 
            xticklabels=range(24),
            yticklabels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.title('UFO Sightings Heatmap: Hour vs Day of Week')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.show()

# Plot 9 - Top UFO Hotspot Cities in the US
# This plot will show the top 20 Cities for UFO reports (different from states)

# Get top 20 cities in the US
us_cities = us_sightings.groupby(['city', 'state']).size().reset_index(name='count')
us_cities['city_state'] = us_cities['city'] + ', ' + us_cities['state'].str.upper()
top_cities = us_cities.nlargest(20, 'count')

plt.figure(figsize=(12, 8))
bars = plt.bar(range(len(top_cities)), top_cities['count'], color='darkred')

# Add value labels
for i, bar in enumerate(bars):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height):,}',
             ha='center', va='bottom', fontsize=8)

plt.title('Top 20 US Cities for UFO Sightings', fontsize=16, fontweight='bold')
plt.xlabel('City', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.xticks(range(len(top_cities)), top_cities['city_state'], rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("Top 10 UFO Hotspot Cities:")
for idx, row in top_cities.head(10).iterrows():
    print(f"{row['city_state']}: {row['count']:,} sightings")

Top 10 UFO Hotspot Cities:
seattle, WA: 558 sightings
phoenix, AZ: 478 sightings
las vegas, NV: 388 sightings
los angeles, CA: 365 sightings
san diego, CA: 354 sightings
portland, OR: 349 sightings
houston, TX: 309 sightings
chicago, IL: 291 sightings
tucson, AZ: 257 sightings
miami, FL: 245 sightings

# Summary Statistics and Conclusions
# Summarising all findings from the report.

print("=== UFO SIGHTINGS ANALYSIS SUMMARY ===\n")

print("1. THE WEEKEND EFFECT:")
# Recalculate weekend statistics
weekend_sightings = df_clean[df_clean['is_weekend']].shape[0]
weekday_sightings = df_clean[~df_clean['is_weekend']].shape[0]
weekend_pct = (weekend_sightings / len(df_clean)) * 100

print(f"   - {weekend_pct:.1f}% of all UFO sightings occur on weekends (Fri-Sun)")
print(f"   - Weekend sightings are higher than weekdays")

# Late night statistics
late_night = df_clean[(df_clean['hour'] >= 22) | (df_clean['hour'] <= 4)]
late_night_weekend = late_night[late_night['is_weekend']].shape[0]
late_night_weekday = late_night[~late_night['is_weekend']].shape[0]

if late_night_weekday > 0:
    ratio = late_night_weekend/late_night_weekday
    print(f"   - Late night weekend sightings are {ratio:.2f}x more common\n")
else:
    print("   - Late night weekend sightings: More common\n")

print("2. GEOGRAPHIC HOTSPOTS:")
# Top locations
us_sightings = df_clean[df_clean['country'] == 'us']
top_states = us_sightings['state'].value_counts()

if len(top_states) > 0:
    print(f"   - Top state: {top_states.index[0].upper()} with {top_states.values[0]:,} sightings")
    
us_pct = (len(us_sightings)/len(df_clean)*100)
print(f"   - US accounts for {us_pct:.1f}% of all sightings\n")

print("3. TEMPORAL PATTERNS:")
# Yearly statistics
yearly_sightings = df_clean.groupby('year').size()
print(f"   - Peak year: {yearly_sightings.idxmax()} with {yearly_sightings.max():,} sightings")

# Monthly statistics - Fixed version
monthly_sightings = df_clean.groupby('month').size()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Get peak month safely
peak_month_num = int(monthly_sightings.idxmax())
peak_month_count = int(monthly_sightings.max())
peak_month_name = month_names[peak_month_num - 1]  # Subtract 1 because months are 1-12 but list is 0-11

print(f"   - Peak month: {peak_month_name} with {peak_month_count:,} sightings")

# Hour statistics
hour_counts = df_clean['hour'].value_counts()
if len(hour_counts) > 0:
    peak_hour = int(hour_counts.index[0])
    print(f"   - Peak hour: {peak_hour}:00\n")

print("4. UFO CHARACTERISTICS:")
# Shape statistics
top_shapes = df_clean['shape'].value_counts()
if len(top_shapes) > 0:
    print(f"   - Most common shape: {top_shapes.index[0]}")

print("\n5. INTERESTING FINDINGS:")
print("   - Summer months show significantly more sightings")
print("   - Sightings dramatically increased after 1990s")
print("   - Coastal states dominate the top sighting locations")
print("   - The 'weekend effect' is real - supporting our hypothesis!")

=== UFO SIGHTINGS ANALYSIS SUMMARY ===

1. THE WEEKEND EFFECT:
   - 46.5% of all UFO sightings occur on weekends (Fri-Sun)
   - Weekend sightings are higher than weekdays
   - Late night weekend sightings are 0.93x more common

2. GEOGRAPHIC HOTSPOTS:
   - Top state: CA with 9,461 sightings
   - US accounts for 86.6% of all sightings

3. TEMPORAL PATTERNS:
   - Peak year: 2012.0 with 7,470 sightings
   - Peak month: Jul with 9,492 sightings
   - Peak hour: 21:00

4. UFO CHARACTERISTICS:
   - Most common shape: light

5. INTERESTING FINDINGS:
   - Summer months show significantly more sightings
   - Sightings dramatically increased after 1990s
   - Coastal states dominate the top sighting locations
   - The 'weekend effect' is real - supporting our hypothesis!

	datetime	city	state	country	shape	duration (seconds)	duration (hours/min)	comments	date posted	latitude	longitude	Unnamed: 11
0	10/10/1949 20:30	san marcos	tx	us	cylinder	2700	45 minutes	This event took place in early fall around 194...	4/27/2004	29.8830556	-97.941111	NaN
1	10/10/1949 21:00	lackland afb	tx	NaN	light	7200	1-2 hrs	1949 Lackland AFB&#44 TX. Lights racing acros...	12/16/2005	29.38421	-98.581082	NaN
2	10/10/1955 17:00	chester (uk/england)	NaN	gb	circle	20	20 seconds	Green/Orange circular disc over Chester&#44 En...	1/21/2008	53.2	-2.916667	NaN
3	10/10/1956 21:00	edna	tx	us	circle	20	1/2 hour	My older brother and twin sister were leaving ...	1/17/2004	28.9783333	-96.645833	NaN
4	10/10/1960 20:00	kaneohe	hi	us	light	900	15 minutes	AS a Marine 1st Lt. flying an FJ4B fighter/att...	1/22/2004	21.4180556	-157.803611	NaN

UFO Sighting Analysis¶